data_dir='/Users/robert/Documents/UMN/air_pollution/'
DF <- read.csv(paste0(data_dir, 'texas.csv'), stringsAsFactors=FALSE)
DF$Date <- ymd(DF$Date_Local)
DF$Date_Local <- NULL
DF$Year <- year(DF$Date)
DF$Longitude <- as.numeric(DF$Longitude)
DF$Latitude <- as.numeric(DF$Latitude)
Describe the data:
describe(DF)
## DF
##
## 16 Variables 4119327 Observations
## ---------------------------------------------------------------------------
## State_Code
## n missing unique Info Mean
## 4119327 0 1 0 48
## ---------------------------------------------------------------------------
## County_Code
## n missing unique Info Mean .05 .10 .25 .50
## 4119327 0 29 0.99 204.3 29 29 113 201
## .75 .90 .95
## 303 439 453
##
## lowest : 29 43 61 113 121, highest: 375 439 453 479 485
## ---------------------------------------------------------------------------
## Site_Num
## n missing unique Info Mean .05 .10 .25 .50
## 4119327 0 54 1 387.8 3 5 24 78
## .75 .90 .95
## 677 1042 1051
##
## lowest : 1 2 3 4 5, highest: 2004 3003 3008 3009 3011
## ---------------------------------------------------------------------------
## Latitude
## n missing unique Info Mean .05 .10 .25 .50
## 4119327 0 66 1 30.4 27.43 28.70 29.30 29.90
## .75 .90 .95
## 31.84 32.76 33.59
##
## lowest : 25.89 26.07 26.23 27.43 27.60
## highest: 33.59 33.59 33.86 35.20 35.21
## ---------------------------------------------------------------------------
## Longitude
## n missing unique Info Mean .05 .10 .25 .50
## 4119327 0 66 1 -98.36 -106.40 -103.18 -100.45 -97.43
## .75 .90 .95
## -95.33 -94.86 -94.09
##
## lowest : -106.50 -106.50 -106.46 -106.40 -106.29
## highest: -94.17 -94.09 -94.08 -93.91 -93.87
## ---------------------------------------------------------------------------
## Time_Local
## n missing unique Info Mean .05 .10 .25 .50
## 4119327 0 24 1 11.5 1 2 5 11
## .75 .90 .95
## 18 21 22
##
## lowest : 0 1 2 3 4, highest: 19 20 21 22 23
## ---------------------------------------------------------------------------
## pm25
## n missing unique Info Mean .05 .10 .25 .50
## 4119327 0 1671 1 9.514 2.1 3.0 5.1 8.1
## .75 .90 .95
## 12.2 17.4 21.5
##
## lowest : -2.1 -1.9 -1.8 -1.7 -1.4
## highest: 617.1 719.4 755.6 757.0 821.9
## ---------------------------------------------------------------------------
## pm10
## n missing unique Info Mean .05 .10 .25 .50
## 316588 3802739 769 1 28.27 7 9 14 21
## .75 .90 .95
## 31 48 67
##
## lowest : -4 -3 -2 -1 0, highest: 4186 4419 4526 4686 4739
## ---------------------------------------------------------------------------
## wind_Knots
## n missing unique Info Mean .05 .10 .25 .50
## 3356199 763128 402 1 5.838 1.1 1.7 3.1 5.2
## .75 .90 .95
## 7.8 10.7 12.7
##
## lowest : 0.00 0.05 0.10 0.20 0.30
## highest: 42.60 43.00 43.10 43.60 44.00
## ---------------------------------------------------------------------------
## wind_Degrees_Compass
## n missing unique Info Mean .05 .10 .25 .50
## 3356199 763128 3602 1 170 20.0 42.0 111.3 160.1
## .75 .90 .95
## 222.9 319.0 340.0
##
## lowest : 0.00 0.05 0.10 0.20 0.30
## highest: 359.60 359.70 359.80 359.90 360.00
## ---------------------------------------------------------------------------
## temperature
## n missing unique Info Mean .05 .10 .25 .50
## 3377228 742099 113 1 68.93 40 46 58 71
## .75 .90 .95
## 81 88 92
##
## lowest : 1 2 3 4 5, highest: 109 110 111 112 114
## ---------------------------------------------------------------------------
## pressure
## n missing unique Info Mean .05 .10 .25 .50
## 265992 3853335 109 1 998.8 889 893 1010 1015
## .75 .90 .95
## 1020 1025 1028
##
## lowest : 871 872 873 874 875, highest: 1044 1045 1046 1047 1048
## ---------------------------------------------------------------------------
## RH_Dewpoint_Percent_relative_humidity
## n missing unique Info Mean .05 .10 .25 .50
## 971442 3147885 100 1 60.29 18 26 43 64
## .75 .90 .95
## 80 89 92
##
## lowest : 1 2 3 4 5, highest: 96 97 98 99 100
## ---------------------------------------------------------------------------
## RH_Dewpoint_Degrees_Fahrenheit
## n missing unique Info Mean .05 .10 .25 .50
## 971442 3147885 86 1 51.49 18 24 37 56
## .75 .90 .95
## 67 72 74
##
## lowest : 0.00 0.05 1.00 2.00 3.00
## highest: 80.00 81.00 82.00 84.00 94.00
## ---------------------------------------------------------------------------
## Date
## n missing unique Info Mean .05
## 4119327 0 3591 1 2010-02-22 2005-07-29
## .10 .25 .50 .75 .90 .95
## 2006-03-10 2007-11-20 2010-03-22 2012-07-16 2013-11-17 2014-04-25
##
## lowest : 2005-01-01 2005-01-02 2005-01-03 2005-01-04 2005-01-05
## highest: 2014-10-27 2014-10-28 2014-10-29 2014-10-30 2014-10-31
## ---------------------------------------------------------------------------
## Year
## n missing unique Info Mean .05 .10 .25 .50
## 4119327 0 10 0.99 2010 2005 2006 2007 2010
## .75 .90 .95
## 2012 2013 2014
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013
## Frequency 347211 345748 380542 435660 448561 455735 437691 442011 472342
## % 8 8 9 11 11 11 11 11 11
## 2014
## Frequency 353826
## % 9
## ---------------------------------------------------------------------------
Remove locations with missing data
by_site <- DF %>% group_by(County_Code, Site_Num, Year) %>%
summarise(avg_pm = mean(pm25),
pm10_missing = sum(is.na(pm10))/length(Site_Num),
pressure_missing = sum(is.na(pressure))/length(Site_Num),
humidity_missing = sum(is.na(RH_Dewpoint_Percent_relative_humidity))/length(Site_Num),
dewpoint_degreesf_missing = sum(is.na(RH_Dewpoint_Degrees_Fahrenheit))/length(Site_Num),
wind_speed_missing = sum(is.na(wind_Knots))/length(Site_Num),
wind_direction_missing = sum(is.na(wind_Degrees_Compass)) / length(Site_Num),
temp_missing = sum(is.na(temperature)) / length(Site_Num),
count = length(County_Code))
# only look at sites with less than 10% missing in some year
non_missing = unique(by_site[by_site$wind_speed_missing < .1 &
by_site$wind_direction_missing < .1 &
by_site$temp_missing < .1, c("County_Code", "Site_Num")])
DF <- inner_join(DF, non_missing, by = c("County_Code", "Site_Num"))
map locations:
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.1.3
map <- get_map(location = 'texas', zoom=6, color="bw")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=texas&zoom=6&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=texas&sensor=false
p <- ggmap(map)
agg_df <- DF %>%
group_by(Latitude, Longitude) %>%
summarise(ct = length(pm25))
p + geom_point(data=agg_df, aes(x=Longitude, y=Latitude), color="red")
cluster locations
n_clusters <- 6
X <- as.matrix(agg_df[,1:2])
# K-Means clustering, scale the data matrix.
clusters <- kmeans(scale(X), n_clusters, iter.max = 25)
# Add cluster label to the data frame.
agg_df$cluster <- factor(clusters$cluster)
# Examine the cluster averages (use unlogged, unscaled variable values).
centers <- agg_df %>% group_by(cluster) %>%
summarise(Latitude = mean(Latitude),
Longitude = mean(Longitude),
num_unique = length(ct),
num = sum(ct))
centers <- centers[order(centers$num_unique, decreasing=TRUE),]
print(xtable(centers), type='html')
| cluster | Latitude | Longitude | num_unique | num | |
|---|---|---|---|---|---|
| 1 | 4 | 29.77 | -94.93 | 14 | 835815 |
| 2 | 6 | 32.53 | -96.67 | 13 | 683680 |
| 3 | 5 | 29.56 | -99.25 | 12 | 1042468 |
| 4 | 3 | 31.64 | -105.11 | 8 | 445118 |
| 5 | 2 | 26.87 | -97.52 | 6 | 332918 |
| 6 | 1 | 33.68 | -100.76 | 3 | 124694 |
plot clusters
p + geom_point(data=agg_df, aes(x=Longitude, y=Latitude, colour=cluster), size=3) +
theme(legend.position = "none")
sizable_clusters <- centers[centers$num_unique > 4,]
plot_df <- agg_df[agg_df$cluster %in% sizable_clusters$cluster,]
p + geom_hex(data=plot_df,
aes(x=Longitude, y=Latitude, fill=cluster),
colour = "white", alpha = .6) +
geom_point(data = sizable_clusters,
aes(x = Longitude,
y = Latitude),
colour = "black",
alpha=.6,
size = 16) +
geom_point(data = sizable_clusters,
aes(x = Longitude,
y = Latitude,
colour = cluster),
alpha=.6,
size = 15) +
geom_text(data = sizable_clusters,
aes(x = Longitude,
y = Latitude,
label = paste0("#", cluster, ": ", num_unique)),
size=4,
colour = "black") +
labs(title = "Spatial Clusters") +
theme(legend.position = "none")
zoom in on largest cluster:
biggest_cluster <- agg_df[agg_df$cluster %in% centers$cluster[1],]
area <- get_map(location = c(lon=mean(biggest_cluster$Longitude),
lat=mean(biggest_cluster$Latitude)),
zoom=9,
color="bw")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=29.770063,-94.930636&zoom=9&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
p <- ggmap(area)
p + geom_point(data=biggest_cluster, aes(x=Longitude, y=Latitude), color="red")
Below is a summary of the data in this location
area_df = DF[DF$Latitude >= min(biggest_cluster$Latitude) &
DF$Latitude <= max(biggest_cluster$Latitude) &
DF$Longitude >= min(biggest_cluster$Longitude) &
DF$Longitude <= max(biggest_cluster$Longitude),]
area_df$County_Code <- factor(area_df$County_Code)
area_df$Site_Num <- factor(area_df$Site_Num)
area_df$Year <- factor(area_df$Year)
area_df$State_Code <- NULL
area_df$pm10 <- NULL
describe(area_df)
## area_df
##
## 14 Variables 835815 Observations
## ---------------------------------------------------------------------------
## County_Code
## n missing unique
## 835815 0 4
##
## 167 (77007, 9%), 201 (546210, 65%), 245 (137177, 16%)
## 339 (75421, 9%)
## ---------------------------------------------------------------------------
## Site_Num
## n missing unique
## 835815 0 12
##
## 14 20 22 24 26 58 78 416 1034 1035
## Frequency 18811 28563 79253 79738 68641 9423 75421 63420 138923 81740
## % 2 3 9 10 8 1 9 8 17 10
## 1039 1050
## Frequency 81770 110112
## % 10 13
## ---------------------------------------------------------------------------
## Latitude
## n missing unique Info Mean .05 .10 .25 .50
## 835815 0 14 0.99 29.78 29.25 29.58 29.67 29.77
## .75 .90 .95
## 29.90 30.07 30.35
##
## 29.254474 29.263319 29.583047 29.670025 29.686389 29.733726
## Frequency 58196 18811 80751 81770 63420 81740
## % 7 2 10 10 8 10
## 29.767971 29.770698 29.802707 29.863953 29.901037 30.06607
## Frequency 80727 9423 68641 79253 79738 28563
## % 10 1 8 9 10 3
## 30.06717 30.350302
## Frequency 29361 75421
## % 4 9
## ---------------------------------------------------------------------------
## Longitude
## n missing unique Info Mean .05 .10 .25 .50
## 835815 0 14 0.99 -95.02 -95.43 -95.33 -95.29 -95.13
## .75 .90 .95
## -94.86 -94.32 -94.09
##
## -95.425128 -95.326125 -95.294722 -95.257593 -95.220587
## Frequency 75421 79738 63420 81740 80727
## % 9 10 8 10 10
## -95.128508 -95.125495 -95.031232 -95.015544 -94.861289
## Frequency 81770 68641 9423 80751 58196
## % 10 8 1 10 7
## -94.856568 -94.3178 -94.09093 -94.077383
## Frequency 18811 79253 29361 28563
## % 2 9 4 3
## ---------------------------------------------------------------------------
## Time_Local
## n missing unique Info Mean .05 .10 .25 .50
## 835815 0 24 1 11.51 1 2 5 12
## .75 .90 .95
## 18 21 22
##
## lowest : 0 1 2 3 4, highest: 19 20 21 22 23
## ---------------------------------------------------------------------------
## pm25
## n missing unique Info Mean .05 .10 .25 .50
## 835815 0 928 1 11.06 3.0 4.2 6.5 9.7
## .75 .90 .95
## 14.1 19.6 23.7
##
## lowest : -1.8 -1.4 -1.2 -1.1 -1.0
## highest: 281.8 282.0 310.0 358.2 755.6
## ---------------------------------------------------------------------------
## wind_Knots
## n missing unique Info Mean .05 .10 .25 .50
## 827008 8807 276 1 5.222 1.0 1.5 2.8 4.8
## .75 .90 .95
## 7.1 9.5 11.0
##
## lowest : 0.00 0.05 0.10 0.20 0.30
## highest: 27.90 28.00 28.80 28.90 32.40
## ---------------------------------------------------------------------------
## wind_Degrees_Compass
## n missing unique Info Mean .05 .10 .25 .50
## 827008 8807 3602 1 163.3 17.0 34.0 100.4 157.0
## .75 .90 .95
## 212.0 321.4 342.5
##
## lowest : 0.00 0.05 0.10 0.20 0.30
## highest: 359.60 359.70 359.80 359.90 360.00
## ---------------------------------------------------------------------------
## temperature
## n missing unique Info Mean .05 .10 .25 .50
## 829185 6630 92 1 70.25 44 50 61 73
## .75 .90 .95
## 81 86 89
##
## lowest : 18 19 20 21 22, highest: 105 106 107 110 114
## ---------------------------------------------------------------------------
## pressure
## n missing unique Info Mean .05 .10 .25 .50
## 221958 613857 56 1 1017 1008 1010 1013 1017
## .75 .90 .95
## 1021 1026 1028
##
## lowest : 992 994 995 996 997, highest: 1044 1045 1046 1047 1048
## ---------------------------------------------------------------------------
## RH_Dewpoint_Percent_relative_humidity
## n missing unique Info Mean .05 .10 .25 .50
## 436860 398955 97 1 69.53 34 43 58 73
## .75 .90 .95
## 84 90 93
##
## lowest : 4 5 6 7 8, highest: 96 97 98 99 100
## ---------------------------------------------------------------------------
## RH_Dewpoint_Degrees_Fahrenheit
## n missing unique Info Mean .05 .10 .25 .50
## 436860 398955 85 1 58.96 29 36 49 64
## .75 .90 .95
## 71 74 75
##
## lowest : 0.00 0.05 1.00 2.00 3.00
## highest: 79.00 80.00 81.00 82.00 84.00
## ---------------------------------------------------------------------------
## Date
## n missing unique Info Mean .05
## 835815 0 3554 1 2009-11-26 2005-07-02
## .10 .25 .50 .75 .90 .95
## 2006-01-19 2007-06-13 2009-11-18 2012-05-21 2013-10-30 2014-04-13
##
## lowest : 2005-01-01 2005-01-02 2005-01-03 2005-01-04 2005-01-05
## highest: 2014-09-26 2014-09-27 2014-09-28 2014-09-29 2014-09-30
## ---------------------------------------------------------------------------
## Year
## n missing unique
## 835815 0 10
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## Frequency 79589 88143 92553 85834 81709 81517 83636 86026 89300 67508
## % 10 11 11 10 10 10 10 10 11 8
## ---------------------------------------------------------------------------
The area containing this cluster has Longitude = (-95.43, -94.08), and Latitude = (29.25, -)
Examine trends across years, month, hour, week, location, etc.
What are the county codes?
uniq_locations = unique(area_df[,c("Longitude", "Latitude", "County_Code")])
p + geom_point(data=uniq_locations, aes(x=Longitude, y=Latitude, colour=County_Code), size=5) +
theme(legend.position = "bottom")
Summaries by county
by_county <- area_df %>% group_by(County_Code) %>%
summarise(avg_pm = mean(pm25),
avg_wind_speed = mean(wind_Knots, na.rm=TRUE),
wind_speed_missing = sum(is.na(wind_Knots))/length(wind_Knots),
avg_wind_direction = mean(wind_Degrees_Compass, na.rm=TRUE),
wind_direction_missing = sum(is.na(wind_Degrees_Compass)) / length(wind_Degrees_Compass),
avg_temp = mean(temperature, na.rm=TRUE),
temp_missing = sum(is.na(temperature)) / length(temperature),
count = length(County_Code))
print(xtable(by_county), type='html')
## <!-- html table generated in R 3.1.0 by xtable 1.7-4 package -->
## <!-- Wed Jun 3 16:39:58 2015 -->
## <table border=1>
## <tr> <th> </th> <th> County_Code </th> <th> avg_pm </th> <th> avg_wind_speed </th> <th> wind_speed_missing </th> <th> avg_wind_direction </th> <th> wind_direction_missing </th> <th> avg_temp </th> <th> temp_missing </th> <th> count </th> </tr>
## <tr> <td align="right"> 1 </td> <td> 167 </td> <td align="right"> 9.25 </td> <td align="right"> 8.92 </td> <td align="right"> 0.03 </td> <td align="right"> 152.09 </td> <td align="right"> 0.03 </td> <td align="right"> 70.75 </td> <td align="right"> 0.02 </td> <td align="right"> 77007 </td> </tr>
## <tr> <td align="right"> 2 </td> <td> 201 </td> <td align="right"> 11.59 </td> <td align="right"> 4.83 </td> <td align="right"> 0.01 </td> <td align="right"> 164.00 </td> <td align="right"> 0.01 </td> <td align="right"> 70.58 </td> <td align="right"> 0.01 </td> <td align="right"> 546210 </td> </tr>
## <tr> <td align="right"> 3 </td> <td> 245 </td> <td align="right"> 10.14 </td> <td align="right"> 4.99 </td> <td align="right"> 0.00 </td> <td align="right"> 169.98 </td> <td align="right"> 0.00 </td> <td align="right"> 69.86 </td> <td align="right"> 0.00 </td> <td align="right"> 137177 </td> </tr>
## <tr> <td align="right"> 4 </td> <td> 339 </td> <td align="right"> 10.75 </td> <td align="right"> 4.80 </td> <td align="right"> 0.01 </td> <td align="right"> 157.37 </td> <td align="right"> 0.01 </td> <td align="right"> 68.01 </td> <td align="right"> 0.01 </td> <td align="right"> 75421 </td> </tr>
## </table>
Plot wind speeds and directions
plot_df <- area_df[is.na(area_df$wind_Knots) == FALSE &
is.na(area_df$wind_Degrees_Compass) == FALSE,]
spdseq <- seq(min(plot_df$wind_Knots, na.rm=TRUE), max(plot_df$wind_Knots, na.rm=TRUE), 6)
# get some information about the number of bins, etc.
n.spd.seq <- length(spdseq)
n.colors.in.range <- n.spd.seq - 1
# create the color map
require(RColorBrewer)
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.1.2
spd.colors <- colorRampPalette(brewer.pal(
min(
max(3,n.colors.in.range),
min(9, n.colors.in.range)
),
"YlGnBu"))(n.colors.in.range)
spd.breaks <- spdseq
spd.labels <- paste(c(spdseq[1:n.spd.seq-1]),
'-',
c(spdseq[2:n.spd.seq]))
plot_df$spd.binned <- cut(x = plot_df$wind_Knots,
breaks = spd.breaks,
labels = spd.labels,
ordered_result = TRUE)
# figure out the wind direction bins
dirres=30
dir.breaks <- c(-dirres/2,
seq(dirres/2, 360-dirres/2, by = dirres),
360+dirres/2)
dir.labels <- c(paste(360-dirres/2,"-",dirres/2),
paste(seq(dirres/2, 360-3*dirres/2, by = dirres),
"-",
seq(3*dirres/2, 360-dirres/2, by = dirres)),
paste(360-dirres/2,"-",dirres/2))
# assign each wind direction to a bin
dir.binned <- cut(plot_df$wind_Degrees_Compass,
breaks = dir.breaks,
ordered_result = TRUE)
levels(dir.binned) <- dir.labels
plot_df$dir.binned <- dir.binned
windrose <- ggplot(data = plot_df,
aes(x = dir.binned,
fill = spd.binned)) +
geom_bar() +
scale_x_discrete(drop = FALSE,
labels = waiver()) +
coord_polar(start = -((dirres/2)/360) * 2*pi) +
scale_fill_manual(name = "Wind Speed (m/s)",
values = spd.colors,
drop = FALSE) +
theme(axis.title.x = element_blank(), legend.position="bottom")
print(windrose)
plot_df <- area_df %>% group_by(Year) %>%
summarise(avg = mean(pm25),
stdev = sd(pm25,na.rm=TRUE))
limits <- aes(ymax = avg + stdev, ymin=avg - stdev)
dodge <- position_dodge(width=0.9)
ggplot(plot_df, aes(x=Year, y=avg)) +
geom_bar(stat="identity", fill="blue", alpha = .5) +
geom_errorbar(limits, position=dodge, width=0.25) +
theme_bw()
plot_df <- area_df %>% group_by(County_Code) %>%
summarise(avg = mean(pm25),
stdev = sd(pm25,na.rm=TRUE))
limits <- aes(ymax = avg + stdev, ymin=avg - stdev)
dodge <- position_dodge(width=0.9)
ggplot(plot_df, aes(x=County_Code, y=avg)) +
geom_bar(stat="identity", fill="blue", alpha = .5) +
geom_errorbar(limits, position=dodge, width=0.25) +
theme_bw()
plot_df <- area_df %>% group_by(Site_Num) %>%
summarise(avg = mean(pm25),
stdev = sd(pm25,na.rm=TRUE))
limits <- aes(ymax = avg + stdev, ymin=avg - stdev)
dodge <- position_dodge(width=0.9)
ggplot(plot_df, aes(x=Site_Num, y=avg)) +
geom_bar(stat="identity", fill="blue", alpha = .5) +
geom_errorbar(limits, position=dodge, width=0.25) +
theme_bw()
plot_df <- area_df %>% group_by(Time_Local) %>%
summarise(avg = mean(pm25),
stdev = sd(pm25,na.rm=TRUE))
limits <- aes(ymax = avg + stdev, ymin=avg - stdev)
dodge <- position_dodge(width=0.9)
ggplot(plot_df, aes(x=Time_Local, y=avg)) +
geom_bar(stat="identity", fill="blue", alpha = .5) +
geom_errorbar(limits, position=dodge, width=0.25) +
theme_bw()